'''
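Usage:

Target URL:
url = 'http://www.23us.so/files/article/html/6/6079/index.html'
Write a regular expression for the information you want to extract:
regex = '<td class="L"><a href=".*?">(.*?)</a></td>'
Instantiate SpiderManXia:
s = SpiderManXia(url)
Call the find_info method; it returns a list of the matched information:
print(s.find_info(regex)[:10])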


'''
import requests,re

class SpiderManXia:
    """Download one web page and pull data out of it with regular expressions.

    The page is requested once, while the instance is being created, and the
    decoded text is stored in ``self.html`` for later ``find_info`` calls.

    Attributes:
        url: Address of the page that is requested.
        charset: Encoding used to decode the response body.
        timeout: Value handed to ``requests.get`` as its ``timeout`` argument.
        html: Decoded text of the response body.
    """

    # Browser-like User-Agent header sent with every request.
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36'}

    def __init__(self, url, charset='utf-8', timeout=10):
        """Store the settings and download the page immediately.

        Args:
            url: Address of the page to download.
            charset: Encoding applied to the response body.
            timeout: Seconds to wait for the server; ``None`` waits forever.

        Raises:
            requests.exceptions.RequestException: If the request fails or
                times out.
        """
        self.url = url
        self.charset = charset
        self.timeout = timeout
        self.html = self.get_h5()

    def find_info(self, regex, flags=0):
        """Return every match of ``regex`` in the downloaded page.

        Args:
            regex: Regular expression pattern to search for.
            flags: Optional ``re`` flags, e.g. ``re.S`` so that ``.`` also
                matches newlines inside multi-line markup.

        Returns:
            The list produced by ``re.findall``; empty when nothing matches.
        """
        return re.findall(regex, self.html, flags)

    def get_h5(self):
        """Request ``self.url`` and return the body decoded with ``self.charset``.

        The HTTP status code is not checked, so the body of an error page is
        returned like any other page.
        """
        # Without a timeout an unresponsive server would block this call forever.
        resp = requests.get(self.url, headers=self.headers, timeout=self.timeout)
        # Force the configured encoding before the body is decoded via .text.
        resp.encoding = self.charset
        return resp.text
    
if __name__ == '__main__':
    # Demo run: download the page and print the first ten matches.
    index_url = 'http://www.23us.so/files/article/html/6/6079/index.html'
    # Two sample patterns; only the second one is used below.
    heading_regex = '<dd><h3>(.*?)</h3></dd>'
    link_text_regex = '<td class="L"><a href=".*?">(.*?)</a></td>'

    spider = SpiderManXia(index_url)
    matches = spider.find_info(link_text_regex)
    print(matches[:10])

























        
